package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MergeInfo;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.FixedBitSet;

/**
 * <p>Expert: a MergePolicy determines the sequence of
 * primitive merge operations.</p>
 *
 * <p>Whenever the segments in an index have been altered by
 * {@link IndexWriter}, either the addition of a newly
 * flushed segment, addition of many segments from
 * addIndexes* calls, or a previous merge that may now need
 * to cascade, {@link IndexWriter} invokes {@link
 * #findMerges} to give the MergePolicy a chance to pick
 * merges that are now required.  This method returns a
 * {@link MergeSpecification} instance describing the set of
 * merges that should be done, or null if no merges are
 * necessary.  When IndexWriter.forceMerge is called, it calls
 * {@link #findForcedMerges(SegmentInfos,int,Map, IndexWriter)} and the MergePolicy should
 * then return the necessary merges.</p>
 *
 * <p>Note that the policy can return more than one merge at
 * a time.  In this case, if the writer is using {@link
 * SerialMergeScheduler}, the merges will be run
 * sequentially but if it is using {@link
 * ConcurrentMergeScheduler} they will be run concurrently.</p>
 *
 * <p>The default MergePolicy is {@link
 * TieredMergePolicy}.</p>
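 *
 * <p>A policy is installed on {@link IndexWriterConfig} before the writer is
 * created.  A minimal sketch (the values are illustrative only, and
 * {@code analyzer} and {@code directory} are assumed to exist in the caller):</p>
 *
 * <pre class="prettyprint">
 * TieredMergePolicy mergePolicy = new TieredMergePolicy();
 * mergePolicy.setNoCFSRatio(1.0);              // always write compound files
 * IndexWriterConfig config = new IndexWriterConfig(analyzer);
 * config.setMergePolicy(mergePolicy);
 * IndexWriter writer = new IndexWriter(directory, config);
 * </pre>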
 *
 * @lucene.experimental
 */
public abstract class MergePolicy {

  /** A map of doc IDs. */
  public static abstract class DocMap {
    /** Sole constructor, typically invoked from sub-classes constructors. */
    protected DocMap() {}

    /** Return the new doc ID according to its old value. */
    public abstract int map(int old);

    /** Useful from an assert. */
    boolean isConsistent(int maxDoc) {
      final FixedBitSet targets = new FixedBitSet(maxDoc);
      for (int i = 0; i < maxDoc; ++i) {
        final int target = map(i);
        if (target < 0 || target >= maxDoc) {
          assert false : "out of range: " + target + " not in [0-" + maxDoc + "[";
          return false;
        } else if (targets.get(target)) {
          assert false : target + " is already taken (" + i + ")";
          return false;
        } else {
          // mark this target as used so that a later duplicate mapping is detected
          targets.set(target);
        }
      }
      return true;
    }
  }
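
  /* For illustration only (not part of the API): a OneMerge implementation that
   * reverses the doc IDs of the merged segment could return the following
   * DocMap from getDocMap(); any such map must be a bijection on [0, maxDoc),
   * which is exactly what DocMap.isConsistent verifies:
   *
   *   new DocMap() {
   *     @Override
   *     public int map(int old) {
   *       return maxDoc - 1 - old;   // maxDoc is assumed to be in scope
   *     }
   *   };
   */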

  /** OneMerge provides the information necessary to perform
   *  an individual primitive merge operation, resulting in
   *  a single new segment.  The merge spec includes the
   *  subset of segments to be merged as well as whether the
   *  new segment should use the compound file format.
   *
   * @lucene.experimental */
  public static class OneMerge {

    SegmentCommitInfo info;         // used by IndexWriter
    boolean registerDone;           // used by IndexWriter
    long mergeGen;                  // used by IndexWriter
    boolean isExternal;             // used by IndexWriter
    int maxNumSegments = -1;        // used by IndexWriter

    /** Estimated size in bytes of the merged segment. */
    public volatile long estimatedMergeBytes;       // used by IndexWriter

    // Sum of sizeInBytes of all SegmentInfos; set by IW.mergeInit
    volatile long totalMergeBytes;

    List<SegmentReader> readers;        // used by IndexWriter

    /** Segments to be merged. */
    public final List<SegmentCommitInfo> segments;

    /** A private {@link RateLimiter} for this merge, used to rate limit writes and abort. */
    public final MergeRateLimiter rateLimiter;

    volatile long mergeStartNS = -1;

    /** Total number of documents in segments to be merged, not accounting for deletions. */
    public final int totalMaxDoc;
    Throwable error;

    /** Sole constructor.
     * @param segments List of {@link SegmentCommitInfo}s
     *        to be merged. */
    public OneMerge(List<SegmentCommitInfo> segments) {
      if (0 == segments.size()) {
        throw new RuntimeException("segments must include at least one segment");
      }
      // clone the list, as the input list may be backed by the original SegmentInfos and may be modified
      this.segments = new ArrayList<>(segments);
      int count = 0;
      for(SegmentCommitInfo info : segments) {
        count += info.info.maxDoc();
      }
      totalMaxDoc = count;

      rateLimiter = new MergeRateLimiter(this);
    }

    /** Called by {@link IndexWriter} after the merge is done and all readers have been closed. */
    public void mergeFinished() throws IOException {
    }

    /** Expert: Get the list of readers to merge. Note that this list does not
     *  necessarily match the list of segments to merge and should only be used
     *  to feed SegmentMerger to initialize a merge. When a {@link OneMerge}
     *  reorders doc IDs, it must override {@link #getDocMap} too so that
     *  deletes that happened during the merge can be applied to the newly
     *  merged segment. */
    public List<CodecReader> getMergeReaders() throws IOException {
      if (readers == null) {
        throw new IllegalStateException("IndexWriter has not initialized readers from the segment infos yet");
      }
      final List<CodecReader> readers = new ArrayList<>(this.readers.size());
      for (SegmentReader reader : this.readers) {
        if (reader.numDocs() > 0) {
          readers.add(reader);
        }
      }
      return Collections.unmodifiableList(readers);
    }

    /**
     * Expert: Sets the {@link SegmentCommitInfo} of the merged segment.
     * Allows sub-classes to e.g. set diagnostics properties.
     */
    public void setMergeInfo(SegmentCommitInfo info) {
      this.info = info;
    }

    /**
     * Returns the {@link SegmentCommitInfo} for the merged segment,
     * or null if it hasn't been set yet.
     */
    public SegmentCommitInfo getMergeInfo() {
      return info;
    }

    /** Expert: If {@link #getMergeReaders()} reorders document IDs, this method
     *  must be overridden to return a mapping from the <i>natural</i> doc ID
     *  (the doc ID that would result from a natural merge) to the actual doc
     *  ID. This mapping is used to apply deletions that happened during the
     *  merge to the new segment. */
    public DocMap getDocMap(MergeState mergeState) {
      return new DocMap() {
        @Override
        public int map(int docID) {
          return docID;
        }
      };
    }

    /** Record that an exception occurred while executing
     *  this merge */
    synchronized void setException(Throwable error) {
      this.error = error;
    }

    /** Retrieve previous exception set by {@link
     *  #setException}. */
    synchronized Throwable getException() {
      return error;
    }

    /** Returns a readable description of the current merge
     *  state. */
    public String segString() {
      StringBuilder b = new StringBuilder();
      final int numSegments = segments.size();
      for(int i=0;i<numSegments;i++) {
        if (i > 0) {
          b.append(' ');
        }
        b.append(segments.get(i).toString());
      }
      if (info != null) {
        b.append(" into ").append(info.info.name);
      }
      if (maxNumSegments != -1) {
        b.append(" [maxNumSegments=" + maxNumSegments + "]");
      }
      if (rateLimiter.getAbort()) {
        b.append(" [ABORTED]");
      }
      return b.toString();
    }

    /**
     * Returns the total size in bytes of this merge. Note that this does not
     * indicate the size of the merged segment, but the
     * input total size. This is only set once the merge is
     * initialized by IndexWriter.
     */
    public long totalBytesSize() throws IOException {
      return totalMergeBytes;
    }

    /**
     * Returns the total number of documents that are included with this merge.
     * Note that this does not indicate the number of documents after the merge.
     */
    public int totalNumDocs() throws IOException {
      int total = 0;
      for (SegmentCommitInfo info : segments) {
        total += info.info.maxDoc();
      }
      return total;
    }

    /** Return {@link MergeInfo} describing this merge. */
    public MergeInfo getStoreMergeInfo() {
      return new MergeInfo(totalMaxDoc, estimatedMergeBytes, isExternal, maxNumSegments);
    }
  }

  /**
   * A MergeSpecification instance provides the information
   * necessary to perform multiple merges.  It simply
   * contains a list of {@link OneMerge} instances.
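   *
   * <p>For example, a policy implementation could build a specification like
   * this (a minimal sketch; {@code eligible} is assumed to be the list of
   * {@link SegmentCommitInfo}s the policy has decided to merge together):</p>
   *
   * <pre class="prettyprint">
   * MergeSpecification spec = new MergeSpecification();
   * spec.add(new OneMerge(eligible));
   * return spec;
   * </pre>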
   */

  public static class MergeSpecification {

    /**
     * The subset of segments to be included in the primitive merge.
     */

    public final List<OneMerge> merges = new ArrayList<>();

    /** Sole constructor.  Use {@link
     *  #add(MergePolicy.OneMerge)} to add merges. */
    public MergeSpecification() {
    }

    /** Adds the provided {@link OneMerge} to this
     *  specification. */
    public void add(OneMerge merge) {
      merges.add(merge);
    }

    /** Returns a description of the merges in this
     *  specification. */
    public String segString(Directory dir) {
      StringBuilder b = new StringBuilder();
      b.append("MergeSpec:\n");
      final int count = merges.size();
      for(int i=0;i<count;i++) {
        b.append("  ").append(1 + i).append(": ").append(merges.get(i).segString());
      }
      return b.toString();
    }
  }

  /** Exception thrown if there are any problems while
   *  executing a merge. */
  public static class MergeException extends RuntimeException {
    private Directory dir;

    /** Create a {@code MergeException}. */
    public MergeException(String message, Directory dir) {
      super(message);
      this.dir = dir;
    }

    /** Create a {@code MergeException}. */
    public MergeException(Throwable exc, Directory dir) {
      super(exc);
      this.dir = dir;
    }

    /** Returns the {@link Directory} of the index that hit
     *  the exception. */
    public Directory getDirectory() {
      return dir;
    }
  }

  /** Thrown when a merge was explicitly aborted because
   *  {@link IndexWriter#abortMerges} was called.  Normally
   *  this exception is privately caught and suppressed by
   *  {@link IndexWriter}. */
  public static class MergeAbortedException extends IOException {
    /** Create a {@link MergeAbortedException}. */
    public MergeAbortedException() {
      super("merge is aborted");
    }

    /** Create a {@link MergeAbortedException} with a
     *  specified message. */
    public MergeAbortedException(String message) {
      super(message);
    }
  }

  /**
   * Default ratio for compound file system usage. Set to <tt>1.0</tt>, always use
   * compound file system.
   */
  protected static final double DEFAULT_NO_CFS_RATIO = 1.0;

  /**
   * Default max segment size in order to use compound file system. Set to {@link Long#MAX_VALUE}.
   */
  protected static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE;

  /** If the size of the merged segment exceeds this ratio of
   *  the total index size then it will remain in
   *  non-compound format. */
  protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;

  /** If the size of the merged segment exceeds
   *  this value then it will not use compound file format. */
  protected long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE;

  /**
   * Creates a new merge policy instance.
   */
  public MergePolicy() {
    this(DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE);
  }

  /**
   * Creates a new merge policy instance with the supplied defaults for
   * {@code noCFSRatio} and {@code maxCFSSegmentSize}. This constructor should
   * be used by subclasses that use different defaults than {@link MergePolicy}.
   */
  protected MergePolicy(double defaultNoCFSRatio, long defaultMaxCFSSegmentSize) {
    this.noCFSRatio = defaultNoCFSRatio;
    this.maxCFSSegmentSize = defaultMaxCFSSegmentSize;
  }

  /**
   * Determine what set of merge operations are now necessary on the index.
   * {@link IndexWriter} calls this whenever there is a change to the segments.
   * This call is always synchronized on the {@link IndexWriter} instance so
   * only one thread at a time will call this method.
   * @param mergeTrigger the event that triggered the merge
   * @param segmentInfos
   *          the total set of segments in the index
   * @param writer the IndexWriter to find the merges on
   */
  public abstract MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, IndexWriter writer)
      throws IOException;

  /**
   * Determine what set of merge operations is necessary in
   * order to merge to {@code <=} the specified segment count. {@link IndexWriter} calls this when its
   * {@link IndexWriter#forceMerge} method is called. This call is always
   * synchronized on the {@link IndexWriter} instance so only one thread at a
   * time will call this method.
   *
   * @param segmentInfos
   *          the total set of segments in the index
   * @param maxSegmentCount
   *          requested maximum number of segments in the index (currently this
   *          is always 1)
   * @param segmentsToMerge
   *          contains the specific SegmentInfo instances that must be merged
   *          away. This may be a subset of all
   *          SegmentInfos.  If the value is True for a
   *          given SegmentInfo, that means this segment was
   *          an original segment present in the
   *          to-be-merged index; else, it was a segment
   *          produced by a cascaded merge.
   * @param writer the IndexWriter to find the merges on
   */
  public abstract MergeSpecification findForcedMerges(
          SegmentInfos segmentInfos, int maxSegmentCount, Map<SegmentCommitInfo,Boolean> segmentsToMerge, IndexWriter writer)
      throws IOException;

  /**
   * Determine what set of merge operations is necessary in order to expunge all
   * deletes from the index.
   *
   * @param segmentInfos
   *          the total set of segments in the index
   * @param writer the IndexWriter to find the merges on
   */
  public abstract MergeSpecification findForcedDeletesMerges(
      SegmentInfos segmentInfos, IndexWriter writer) throws IOException;
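
  /* A minimal sketch of a concrete policy (illustrative only, not part of this
   * class): a policy that never selects any merges on its own simply returns
   * null from all three methods above, which tells IndexWriter that no merges
   * are necessary:
   *
   *   public final class NeverMergePolicy extends MergePolicy {
   *     @Override
   *     public MergeSpecification findMerges(MergeTrigger mergeTrigger,
   *         SegmentInfos segmentInfos, IndexWriter writer) {
   *       return null;
   *     }
   *     @Override
   *     public MergeSpecification findForcedMerges(SegmentInfos segmentInfos,
   *         int maxSegmentCount, Map<SegmentCommitInfo,Boolean> segmentsToMerge,
   *         IndexWriter writer) {
   *       return null;
   *     }
   *     @Override
   *     public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos,
   *         IndexWriter writer) {
   *       return null;
   *     }
   *   }
   */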

  /**
   * Returns true if a new segment (regardless of its origin) should use the
   * compound file format. The default implementation returns <code>true</code>
   * iff the size of the given mergedInfo is less than or equal to
   * {@link #getMaxCFSSegmentSizeMB()} and less than or equal to the
   * total index size multiplied by {@link #getNoCFSRatio()}.
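   *
   * <p>For example (illustrative numbers only): with a {@code noCFSRatio} of
   * {@code 0.1} and a total index size of 100 GB, a merged segment larger than
   * 10 GB would not be written in compound format.</p>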
   */
  public boolean useCompoundFile(SegmentInfos infos, SegmentCommitInfo mergedInfo, IndexWriter writer) throws IOException {
    if (getNoCFSRatio() == 0.0) {
      return false;
    }
    long mergedInfoSize = size(mergedInfo, writer);
    if (mergedInfoSize > maxCFSSegmentSize) {
      return false;
    }
    if (getNoCFSRatio() >= 1.0) {
      return true;
    }
    long totalSize = 0;
    for (SegmentCommitInfo info : infos) {
      totalSize += size(info, writer);
    }
    return mergedInfoSize <= getNoCFSRatio() * totalSize;
  }

  /** Return the byte size of the provided {@link
   *  SegmentCommitInfo}, pro-rated by the percentage of
   *  non-deleted documents. */
  protected long size(SegmentCommitInfo info, IndexWriter writer) throws IOException {
    long byteSize = info.sizeInBytes();
    int delCount = writer.numDeletedDocs(info);
    double delRatio = info.info.maxDoc() <= 0 ? 0.0f : (float) delCount / (float) info.info.maxDoc();
    assert delRatio <= 1.0;
    return (info.info.maxDoc() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio)));
  }

  /** Returns true if this single info is already fully merged (has no
   *  pending deletes, is in the same dir as the
   *  writer, and matches the current compound file setting). */
  protected final boolean isMerged(SegmentInfos infos, SegmentCommitInfo info, IndexWriter writer) throws IOException {
    assert writer != null;
    boolean hasDeletions = writer.numDeletedDocs(info) > 0;
    return !hasDeletions &&
      info.info.dir == writer.getDirectory() &&
      useCompoundFile(infos, info, writer) == info.info.getUseCompoundFile();
  }

  /** Returns current {@code noCFSRatio}.
   *
   *  @see #setNoCFSRatio */
  public final double getNoCFSRatio() {
    return noCFSRatio;
  }

  /** If a merged segment will be more than this percentage
   *  of the total size of the index, leave the segment as
   *  non-compound file even if compound file is enabled.
   *  Set to 1.0 to always use CFS regardless of merge
   *  size. */
  public final void setNoCFSRatio(double noCFSRatio) {
    if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
      throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
    }
    this.noCFSRatio = noCFSRatio;
  }

  /** Returns the largest size allowed for a compound file segment */
  public final double getMaxCFSSegmentSizeMB() {
    return maxCFSSegmentSize/1024/1024.;
  }

  /** If a merged segment will be more than this value,
   *  leave the segment as
   *  non-compound file even if compound file is enabled.
   *  Set this to Double.POSITIVE_INFINITY (default) and noCFSRatio to 1.0
   *  to always use CFS regardless of merge size. */
  public final void setMaxCFSSegmentSizeMB(double v) {
    if (v < 0.0) {
      throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")");
    }
    v *= 1024 * 1024;
    this.maxCFSSegmentSize = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
  }
}